Ejercicios GLMs y Multivariante

Ejercicio: GLM binomial

Tenemos datos de la presencia de una especie en varios puntos de muestreo en los que también contamos con información de la temperatura media anual (bio_1) y la precipitación anual (bio_12). Queremos saber si exista una relación entre la presencia de la especie en los lugares de muestreo y dichas variables climáticas.

Cargar los datos

library(enmpa)
data(enm_data)
head(enm_data)
  Sp     bio_1 bio_12
1  0  4.222687    403
2  0  6.006802    738
3  0  4.079385    786
4  1  8.418489    453
5  0  8.573750    553
6  1 16.934618    319

Análisis exploratorios

library(ggplot2)
library(patchwork)
p1 <- ggplot(enm_data, aes(x = bio_1, y = Sp)) +
  geom_point(color = "blue") +
  labs(title = "Presencia vs Temperatura") +
  theme_minimal(base_size = 25)
p2 <- ggplot(enm_data, aes(x = bio_12, y = Sp)) +
  geom_point(color = "red") +
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE, color = "blue") +
  labs(title = "Presencia vs Precipitación") +
  theme_minimal(base_size = 25)

p1 + p2  

Ajuste modelo

glm_bino1<-glm(Sp ~ bio_1 * bio_12, enm_data, family = "binomial")
anova(glm_bino1, test = "Chisq")
Analysis of Deviance Table

Model: binomial, link: logit

Response: Sp

Terms added sequentially (first to last)

             Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                          5626     3374.9              
bio_1         1   869.35      5625     2505.6 < 2.2e-16 ***
bio_12        1   177.56      5624     2328.0 < 2.2e-16 ***
bio_1:bio_12  1    54.10      5623     2273.9 1.902e-13 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary((glm_bino1))

Call:
glm(formula = Sp ~ bio_1 * bio_12, family = "binomial", data = enm_data)

Coefficients:
               Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -1.6151731  0.5238525  -3.083  0.00205 ** 
bio_1         0.1228181  0.0392011   3.133  0.00173 ** 
bio_12       -0.0112906  0.0013863  -8.144 3.81e-16 ***
bio_1:bio_12  0.0006733  0.0000988   6.815 9.42e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 3374.9  on 5626  degrees of freedom
Residual deviance: 2273.9  on 5623  degrees of freedom
AIC: 2281.9

Number of Fisher Scoring iterations: 8
Supuestos del modelo
par(mfcol=c(2,2))
plot(glm_bino1)
Gráficos de predicciones
# Graficando la temperatura en el eje x

newdat <- with(enm_data, expand.grid(bio_1 = seq(min(bio_1), max(bio_1), length = 100),
                 bio_12 = quantile(bio_12, probs = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9))))
pred <- predict(glm_bino1, newdat, type = "link", se.fit = TRUE)
newdat$pred <- plogis(pred$fit)
newdat$lwr <- plogis(pred$fit - 1.96 * pred$se.fit)
newdat$upr <- plogis(pred$fit + 1.96 * pred$se.fit)
p3 <- ggplot(newdat, aes(x = bio_1, y = pred, color = as.factor(bio_12))) +
  geom_line(size = 1.2) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, fill = as.factor(bio_12)), 
              alpha = 0.2, color = NA) +
  labs(y = "Probabilidad predicha ± IC95%",
    x = "Temperatura",
    color = "Precipitación (deciles)",
    fill = "Precipitación (deciles)") +
  theme_minimal(base_size = 25)

# Graficando la precipitación en el eje x

newdat2 <- with(enm_data, expand.grid(bio_12 = seq(min(bio_12), max(bio_12), length = 100),
                 bio_1 = quantile(bio_1, probs = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9))))
pred <- predict(glm_bino1, newdat2, type = "link", se.fit = TRUE)
newdat2$pred <- plogis(pred$fit)
newdat2$lwr <- plogis(pred$fit - 1.96 * pred$se.fit)
newdat2$upr <- plogis(pred$fit + 1.96 * pred$se.fit)
p4 <- ggplot(newdat2, aes(x = bio_12, y = pred, color = as.factor(bio_1))) +
  geom_line(size = 1.2) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, fill = as.factor(bio_1)), 
              alpha = 0.2, color = NA) +
labs(x = "Precipitación",
    color = "Temperatura (deciles)",
    fill = "Temperatura (deciles)") +
  theme_minimal(base_size = 25)
p3 + p4

Ejercicio: GLM y multivariante

Estudio sobre el efecto del uso de suelo sobre los polinizadores. En 38 puntos utlizando círculos concéntricos de radio 100, 250, 500 y 750 m, se midieron diferentes coberturas y usos de suelo y, posteriormente, se estimaron el número de polinizadores. Se midieron 8 coberturas distntas: área urbanizada (Prop_imp), hábitats ricos en fores (Prop_fow), infraestructuras doméstcas (incluyendo parques) (Prop_domestcinfrastructure), jardines (Prop_gard), áreas con cobertura arbolada (Prop_tree), suelo agrícola (Prop_ag), áreas abiertas (Prop_open) y carreteras (Prop_road).

Se quiere construir un modelo que nos permita investgar la relación entre el número de polinizadores (peak_colony) y las diferentes coberturas del suelo medidas en un radio de 750 m.

Objetvos:

  1. Reducir la dimensionalidad de las variables de cobertura del suelo y sintetzar toda esta variabilidad en unas pocas variables (máximo 2).

  2. Generar un modelo estadístco que permita explicar el número de polinizadores en función de las característcas del paisaje en términos de tpo de coberturas y usos del suelo.

Cargar los datos

poli <- read.table("polinizacion.txt",header=T)
head(poli)
  Site peak_colony Prop_imp100 Prop_flow100 Prop_domesticinfrastructure100
1   MY        2822   0.0000000   0.59318521                      0.0000000
2   AD           4   0.4304040   0.32636402                      0.8525614
3   AI           1   0.1442782   0.00000000                      0.1965636
4   AN           1   0.7187448   0.21098314                      0.9297279
5   BA           1   0.6546491   0.25044018                      0.8847154
6   BL           1   0.7393377   0.07451079                      0.8464626
  Prop_open100 Prop_tree100 Prop_ag100 Prop_gard100 Prop_road100 Prop_imp250
1    0.5931852   0.40681479  0.5931852   0.00000000  0.000165810  0.00000000
2    0.6547272   0.14743857  0.0000000   0.32636402  0.001915868  0.43888785
3    0.1593226   0.80343637  0.0000000   0.00000000  0.001150937  0.08004157
4    0.4455016   0.07027208  0.0000000   0.21098314  0.001001502  0.67856757
5    0.4621447   0.11528465  0.0000000   0.23006624  0.001006121  0.66598245
6    0.5150527   0.15353738  0.0000000   0.06154875  0.002007025  0.65843379
  Prop_flow250 Prop_domesticinfrastructure250 Prop_open250 Prop_tree250
1   0.49036380                      0.0000000    0.5953584   0.40464156
2   0.26038973                      0.8439741    0.6497614   0.15602591
3   0.03542349                      0.2568828    0.2486090   0.72195257
4   0.18720314                      0.8790103    0.4831285   0.12098970
5   0.26844026                      0.9211543    0.5031796   0.07796024
6   0.09037877                      0.7869442    0.4901593   0.21305575
  Prop_ag250 Prop_gard250 Prop_road250 Prop_imp500 Prop_flow500
1  0.5953584   0.00000000   0.00000000  0.00641708   0.42301659
2  0.0000000   0.25831045   0.24467518  0.39815919   0.23753524
3  0.0000000   0.03542349   0.03559449  0.06351059   0.03557817
4  0.0000000   0.18720314   0.28268581  0.65303104   0.16970027
5  0.0000000   0.25517186   0.24556602  0.64084857   0.15538152
6  0.0000000   0.07349380   0.35106568  0.65646774   0.08767264
  Prop_domesticinfrastructure500 Prop_open500 Prop_tree500 Prop_ag500
1                    0.008224475    0.8377424    0.1598467  0.8238437
2                    0.774511307    0.6145572    0.2006158  0.0000000
3                    0.181777823    0.2104074    0.7635708  0.0000000
4                    0.848094039    0.4644080    0.1519060  0.0000000
5                    0.835894684    0.4831997    0.1602247  0.0000000
6                    0.770305031    0.4731078    0.2186598  0.0000000
  Prop_gard500 Prop_road500 Prop_imp750 Prop_flow750
1   0.00200969  0.004006115  0.03743636   0.37683541
2   0.23133368  0.213135568  0.42767732   0.24789364
3   0.02732077  0.031516881  0.06530442   0.05684796
4   0.15141520  0.269345049  0.59502016   0.15727413
5   0.15145061  0.271735704  0.65149760   0.14945762
6   0.04311380  0.321826508  0.63539017   0.06389764
  Prop_domesticinfrastructure750 Prop_open750 Prop_tree750 Prop_ag750
1                     0.04211462    0.8764714    0.1120512  0.8293233
2                     0.77347431    0.6054083    0.1997195  0.0000000
3                     0.17742188    0.2006209    0.7684559  0.0000000
4                     0.82946686    0.5065353    0.1672214  0.0000000
5                     0.85567774    0.5024216    0.1414423  0.0000000
6                     0.75339212    0.5187730    0.1812005  0.0000000
  Prop_gard750 Prop_road750
1  0.007189833   0.02595887
2  0.219185832   0.23111089
3  0.050209658   0.02984606
4  0.111557562   0.26837617
5  0.146690935   0.28619237
6  0.025606357   0.26069696

Análisis exploratorios

library(corrgram)
corrgram(poli[,27:34], lower.panel = panel.pts,upper.panel = panel.conf, diag.panel = panel.density)

Hacer PCA

pca_poli <- prcomp(poli[,-c(1:26)], scale = T)
summary(pca_poli)
Importance of components:
                          PC1    PC2    PC3     PC4     PC5     PC6     PC7
Standard deviation     2.1535 1.4667 0.9199 0.48302 0.28118 0.19286 0.11714
Proportion of Variance 0.5797 0.2689 0.1058 0.02916 0.00988 0.00465 0.00172
Cumulative Proportion  0.5797 0.8486 0.9544 0.98356 0.99344 0.99809 0.99981
                           PC8
Standard deviation     0.03921
Proportion of Variance 0.00019
Cumulative Proportion  1.00000
pca_poli$rotation[,1:2]
                                      PC1           PC2
Prop_imp750                     0.4400965 -0.0004399965
Prop_flow750                    0.1471257  0.5120752931
Prop_domesticinfrastructure750  0.4584106  0.0367988149
Prop_open750                   -0.2469484  0.5604824442
Prop_tree750                   -0.1559495 -0.5784133168
Prop_ag750                     -0.4154653  0.2582367962
Prop_gard750                    0.3494334  0.1415030037
Prop_road750                    0.4410328  0.0318224679

Interpretar matriz factorial y correlacionar variables originales con PCA

library(factoextra)
fviz_pca_var(pca_poli)

Añadir los dos primeros ejes a los datos

poli$pca1<-pca_poli$x[,1]
poli$pca2<-pca_poli$x[,2]

Hacer modelo

glm_poli1<-glm(peak_colony~pca1+pca2,poli,family = "poisson")
library(car)
Anova(glm_poli1, type = "III")
Analysis of Deviance Table (Type III tests)

Response: peak_colony
     LR Chisq Df Pr(>Chisq)    
pca1   5221.8  1  < 2.2e-16 ***
pca2   9531.8  1  < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Analizar supuestos del modelo

par(mfcol=c(2,2))
plot(glm_poli1)
library(aods3)
gof(glm_poli1)
D  = 999.9372, df = 35, P(>D) = 3.025866e-187
X2 = 1125.487, df = 35, P(>X2) = 1.158689e-213

Modelo binomial negativo

library(MASS)
glm_poli2 <- glm.nb(peak_colony ~ pca1 + pca2, poli)
Anova(glm_poli2, type="III")
Analysis of Deviance Table (Type III tests)

Response: peak_colony
     LR Chisq Df Pr(>Chisq)    
pca1   443.61  1  < 2.2e-16 ***
pca2   166.06  1  < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Supuestos del modelo

par(mfcol=c(2,2))
plot(glm_poli2)
gof(glm_poli2)
D  = 26.2212, df = 35, P(>D) = 0.8579195
X2 = 37.4745, df = 35, P(>X2) = 0.3562873

Interpretación del modelo

summary(glm_poli2)

Call:
glm.nb(formula = peak_colony ~ pca1 + pca2, data = poli, init.theta = 4.995065004, 
    link = log)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  2.53568    0.11051   22.95   <2e-16 ***
pca1        -0.97994    0.04951  -19.79   <2e-16 ***
pca2         0.83538    0.06621   12.62   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(4.9951) family taken to be 1)

    Null deviance: 953.975  on 37  degrees of freedom
Residual deviance:  26.221  on 35  degrees of freedom
AIC: 263.82

Number of Fisher Scoring iterations: 1

              Theta:  5.00 
          Std. Err.:  1.41 

 2 x log-likelihood:  -255.823 
library(visreg)
par(mfcol=c(2,2))
visreg(glm_poli2, scale="response")